package com.apobates.jforum2.utils.nlp.impl.relate;

import com.apobates.jforum2.utils.nlp.AbstractTagRelateQueryExecutor;
import com.apobates.jforum2.utils.nlp.RelateWordStats;
import com.apobates.jforum2.utils.nlp.TagRelateResult;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Predicate;
import java.util.stream.Stream;

/**
 * Ranks topics by the cosine similarity between the reference word frequencies
 * (the map handed to the constructor) and each topic's frequencies for the same words.
 *
 * <p>Instances are effectively immutable: {@link #load(Collection)} never mutates the
 * receiver, it returns a new instance that carries the per-topic hits.
 *
 * @author xiaofanku
 * @since 20200111
 */
public class TagRelateDistance extends AbstractTagRelateQueryExecutor{
    // Key = topic id, Value = map of {word -> frequency of that word in the topic}
    private final Map<Long, Map<String, Integer>> dis;

    /**
     * Creates an executor that has no topic data yet; call {@link #load(Collection)}
     * to obtain an instance that can produce results.
     *
     * @param wordFrequency reference words and their frequencies
     */
    public TagRelateDistance(Map<String, Integer> wordFrequency) {
        super(wordFrequency);
        this.dis = Collections.emptyMap();
    }

    /**
     * Creates an executor that already carries per-topic hits.
     *
     * @param wordFrequency reference words and their frequencies
     * @param result        topic id -> {word -> frequency}; the outer map is copied
     */
    private TagRelateDistance(Map<String, Integer> wordFrequency, Map<Long, Map<String, Integer>> result){
        super(wordFrequency);
        this.dis = Map.copyOf(result);
    }

    /**
     * Scores every loaded topic by cosine similarity and returns the best matches,
     * highest similarity first.
     *
     * @param showSize maximum number of results to return
     * @return the ranked results, or an empty stream when no topic data is loaded
     */
    @Override
    public Stream<TagRelateResult> getResult(int showSize) {
        if (dis.isEmpty()) {
            return Stream.empty();
        }
        final Map<String, Integer> sourceWords = getRawdata();
        // The second constructor argument is a placeholder 0; presumably applyWithIndex
        // fills in the final position -- TODO confirm against the base class.
        Stream<TagRelateResult> result = dis.entrySet().parallelStream()
                .map(entry -> new TagRelateResult(entry.getKey(), 0, similarityTo(sourceWords, entry.getValue())))
                .sorted(Comparator.comparing(TagRelateResult::getSimilarity).reversed())
                .limit(showSize);
        return applyWithIndex(result);
    }

    /**
     * Builds a new executor holding, per topic, the frequencies of those words that
     * also appear in the reference word map. Entries whose word is {@code null} or not
     * a reference word are ignored.
     *
     * <p>If the same (topic, word) pair occurs more than once, one of the frequencies
     * wins; which one is not defined because the input is processed in parallel.
     *
     * @param wordSource word statistics of candidate topics
     * @return a new instance carrying the matching statistics
     */
    @Override
    public TagRelateDistance load(Collection<RelateWordStats> wordSource) {
        final Map<String, Integer> targetWords = getRawdata();
        Map<Long, Map<String, Integer>> calcResult = new ConcurrentHashMap<>();
        List.copyOf(wordSource)
                .parallelStream()
                // Direct key lookup instead of scanning the whole key set for every element.
                .filter(rws -> rws.getWord() != null && targetWords.containsKey(rws.getWord()))
                // The inner map must be thread-safe as well: several worker threads can
                // insert words for the same topic at the same time.
                .forEach(rws -> calcResult
                        .computeIfAbsent(rws.getTopic(), k -> new ConcurrentHashMap<>())
                        .put(rws.getWord(), rws.getFrequency()));
        return new TagRelateDistance(targetWords, calcResult);
    }

    /**
     * Lines up the frequencies of a topic's hit words with the reference frequencies of
     * the same words and returns their cosine similarity.
     *
     * @param sourceWords reference words and their frequencies
     * @param topicWords  hit words of one topic and their frequencies
     * @return cosine similarity of the two aligned frequency vectors
     */
    private double similarityTo(Map<String, Integer> sourceWords, Map<String, Integer> topicWords) {
        int size = topicWords.size();
        int[] targetWordRates = new int[size];
        int[] sourceWordRates = new int[size];
        int index = 0;
        for (Entry<String, Integer> hit : topicWords.entrySet()) {
            targetWordRates[index] = hit.getValue();
            sourceWordRates[index] = sourceWords.get(hit.getKey());
            index++;
        }
        return cosineSimiliarity(sourceWordRates, targetWordRates);
    }

    /**
     * Euclidean distance between two equally long frequency vectors (n-dimensional formula).
     * https://baike.baidu.com/item/%E6%AC%A7%E5%87%A0%E9%87%8C%E5%BE%97%E5%BA%A6%E9%87%8F
     *
     * <p>Not called anywhere in this class at present.
     *
     * @param sourceWordRates word frequencies of the reference text
     * @param targetWordRates word frequencies of the topic being compared
     * @return the square root of the summed squared differences
     */
    private double euclideanDistance(int[] sourceWordRates, int[] targetWordRates) {
        // Accumulate in long so large frequencies cannot overflow the sum of squares.
        long squaredSum = 0;
        for (int i = 0; i < sourceWordRates.length; i++) {
            long diff = (long) sourceWordRates[i] - targetWordRates[i];
            squaredSum += diff * diff;
        }
        return Math.sqrt(squaredSum);
    }

    /**
     * Cosine similarity http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html
     *
     * @param sourceWordRates word frequencies of the reference text
     * @param targetWordRates word frequencies of the topic being compared
     * @return dot product divided by the product of both magnitudes, or {@code 0.0}
     *         when either vector has zero magnitude (including empty vectors)
     */
    private double cosineSimiliarity(int[] sourceWordRates, int[] targetWordRates) {
        // Accumulate in long so large frequencies cannot overflow the products.
        long numerator = 0;
        long sourceSquares = 0;
        long targetSquares = 0;
        for (int i = 0; i < sourceWordRates.length; i++) {
            long s = sourceWordRates[i];
            long t = targetWordRates[i];
            numerator += s * t;
            sourceSquares += s * s;
            targetSquares += t * t;
        }
        // Denominator: product of the two vector magnitudes.
        double denominator = Math.sqrt(sourceSquares) * Math.sqrt(targetSquares);
        if (denominator == 0.0) {
            // Avoid NaN, which would otherwise disturb the ordering of the results.
            return 0.0;
        }
        return numerator / denominator;
    }
}